{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "!pip install \"modin[ray]\" -q\n",
        "import warnings\n",
        "warnings.filterwarnings('ignore')\n",
        "\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import time\n",
        "import os\n",
        "from typing import Dict, Any\n",
        "\n",
        "import modin.pandas as mpd\n",
        "import ray\n",
        "\n",
        "ray.init(ignore_reinit_error=True, num_cpus=2)\n",
        "print(f\"Ray initialized with {ray.cluster_resources()}\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "OBZkMMqx9QcN",
        "outputId": "5be43d42-3b5c-49bf-f3cf-8e99e094054b"
      },
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m68.9/68.9 MB\u001b[0m \u001b[31m9.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m34.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "2025-07-01 17:11:21,502\tINFO worker.py:1917 -- Started a local Ray instance.\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Ray initialized with {'CPU': 2.0, 'node:__internal_head__': 1.0, 'node:172.28.0.12': 1.0, 'memory': 9341988455.0, 'object_store_memory': 4003709337.0}\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "def benchmark_operation(pandas_func, modin_func, data, operation_name: str) -> Dict[str, Any]:\n",
        "    \"\"\"Compare pandas vs modin performance\"\"\"\n",
        "\n",
        "    start_time = time.time()\n",
        "    pandas_result = pandas_func(data['pandas'])\n",
        "    pandas_time = time.time() - start_time\n",
        "\n",
        "    start_time = time.time()\n",
        "    modin_result = modin_func(data['modin'])\n",
        "    modin_time = time.time() - start_time\n",
        "\n",
        "    speedup = pandas_time / modin_time if modin_time > 0 else float('inf')\n",
        "\n",
        "    print(f\"\\n{operation_name}:\")\n",
        "    print(f\"  Pandas: {pandas_time:.3f}s\")\n",
        "    print(f\"  Modin:  {modin_time:.3f}s\")\n",
        "    print(f\"  Speedup: {speedup:.2f}x\")\n",
        "\n",
        "    return {\n",
        "        'operation': operation_name,\n",
        "        'pandas_time': pandas_time,\n",
        "        'modin_time': modin_time,\n",
        "        'speedup': speedup\n",
        "    }"
      ],
      "metadata": {
        "id": "wygzNKEa9Rub"
      },
      "execution_count": 2,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def create_large_dataset(rows: int = 1_000_000):\n",
        "    \"\"\"Generate synthetic dataset for testing\"\"\"\n",
        "    np.random.seed(42)\n",
        "\n",
        "    data = {\n",
        "        'customer_id': np.random.randint(1, 50000, rows),\n",
        "        'transaction_amount': np.random.exponential(50, rows),\n",
        "        'category': np.random.choice(['Electronics', 'Clothing', 'Food', 'Books', 'Sports'], rows),\n",
        "        'region': np.random.choice(['North', 'South', 'East', 'West'], rows),\n",
        "        'date': pd.date_range('2020-01-01', periods=rows, freq='H'),\n",
        "        'is_weekend': np.random.choice([True, False], rows, p=[0.3, 0.7]),\n",
        "        'rating': np.random.uniform(1, 5, rows),\n",
        "        'quantity': np.random.poisson(3, rows) + 1,\n",
        "        'discount_rate': np.random.beta(2, 5, rows),\n",
        "        'age_group': np.random.choice(['18-25', '26-35', '36-45', '46-55', '55+'], rows)\n",
        "    }\n",
        "\n",
        "    pandas_df = pd.DataFrame(data)\n",
        "    modin_df = mpd.DataFrame(data)\n",
        "\n",
        "    print(f\"Dataset created: {rows:,} rows × {len(data)} columns\")\n",
        "    print(f\"Memory usage: {pandas_df.memory_usage(deep=True).sum() / 1024**2:.1f} MB\")\n",
        "\n",
        "    return {'pandas': pandas_df, 'modin': modin_df}\n",
        "\n",
        "dataset = create_large_dataset(500_000)\n",
        "\n",
        "print(\"\\n\" + \"=\"*60)\n",
        "print(\"ADVANCED MODIN OPERATIONS BENCHMARK\")\n",
        "print(\"=\"*60)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "D6rxRoE59YkP",
        "outputId": "b188441b-54e8-44c4-ca38-c6e2e35ee3af"
      },
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Dataset created: 500,000 rows × 10 columns\n",
            "Memory usage: 112.5 MB\n",
            "\n",
            "============================================================\n",
            "ADVANCED MODIN OPERATIONS BENCHMARK\n",
            "============================================================\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "def complex_groupby(df):\n",
        "    return df.groupby(['category', 'region']).agg({\n",
        "        'transaction_amount': ['sum', 'mean', 'std', 'count'],\n",
        "        'rating': ['mean', 'min', 'max'],\n",
        "        'quantity': 'sum'\n",
        "    }).round(2)\n",
        "\n",
        "groupby_results = benchmark_operation(\n",
        "    complex_groupby, complex_groupby, dataset, \"Complex GroupBy Aggregation\"\n",
        ")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "wwptNjEo9hyi",
        "outputId": "98146627-9210-4f66-920e-d060da4dc556"
      },
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "Complex GroupBy Aggregation:\n",
            "  Pandas: 0.346s\n",
            "  Modin:  6.456s\n",
            "  Speedup: 0.05x\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "def advanced_cleaning(df):\n",
        "    df_clean = df.copy()\n",
        "\n",
        "    Q1 = df_clean['transaction_amount'].quantile(0.25)\n",
        "    Q3 = df_clean['transaction_amount'].quantile(0.75)\n",
        "    IQR = Q3 - Q1\n",
        "    df_clean = df_clean[\n",
        "        (df_clean['transaction_amount'] >= Q1 - 1.5 * IQR) &\n",
        "        (df_clean['transaction_amount'] <= Q3 + 1.5 * IQR)\n",
        "    ]\n",
        "\n",
        "    df_clean['transaction_score'] = (\n",
        "        df_clean['transaction_amount'] * df_clean['rating'] * df_clean['quantity']\n",
        "    )\n",
        "    df_clean['is_high_value'] = df_clean['transaction_amount'] > df_clean['transaction_amount'].median()\n",
        "\n",
        "    return df_clean\n",
        "\n",
        "cleaning_results = benchmark_operation(\n",
        "    advanced_cleaning, advanced_cleaning, dataset, \"Advanced Data Cleaning\"\n",
        ")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "1o-Zbnwn9kxy",
        "outputId": "97445c33-2752-43be-95de-767a7a641a4c"
      },
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "Advanced Data Cleaning:\n",
            "  Pandas: 0.148s\n",
            "  Modin:  6.948s\n",
            "  Speedup: 0.02x\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "def time_series_analysis(df):\n",
        "    df_ts = df.copy()\n",
        "    df_ts = df_ts.set_index('date')\n",
        "\n",
        "    daily_sum = df_ts.groupby(df_ts.index.date)['transaction_amount'].sum()\n",
        "    daily_mean = df_ts.groupby(df_ts.index.date)['transaction_amount'].mean()\n",
        "    daily_count = df_ts.groupby(df_ts.index.date)['transaction_amount'].count()\n",
        "    daily_rating = df_ts.groupby(df_ts.index.date)['rating'].mean()\n",
        "\n",
        "    daily_stats = type(df)({\n",
        "        'transaction_sum': daily_sum,\n",
        "        'transaction_mean': daily_mean,\n",
        "        'transaction_count': daily_count,\n",
        "        'rating_mean': daily_rating\n",
        "    })\n",
        "\n",
        "    daily_stats['rolling_mean_7d'] = daily_stats['transaction_sum'].rolling(window=7).mean()\n",
        "\n",
        "    return daily_stats\n",
        "\n",
        "ts_results = benchmark_operation(\n",
        "    time_series_analysis, time_series_analysis, dataset, \"Time Series Analysis\"\n",
        ")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Tl8dKveA9pvB",
        "outputId": "bcf63e67-81fb-4179-a4d3-3c7dc1ebef61"
      },
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "Time Series Analysis:\n",
            "  Pandas: 0.971s\n",
            "  Modin:  4.298s\n",
            "  Speedup: 0.23x\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "def create_lookup_data():\n",
        "    \"\"\"Create lookup tables for joins\"\"\"\n",
        "    categories_data = {\n",
        "        'category': ['Electronics', 'Clothing', 'Food', 'Books', 'Sports'],\n",
        "        'commission_rate': [0.15, 0.20, 0.10, 0.12, 0.18],\n",
        "        'target_audience': ['Tech Enthusiasts', 'Fashion Forward', 'Food Lovers', 'Readers', 'Athletes']\n",
        "    }\n",
        "\n",
        "    regions_data = {\n",
        "        'region': ['North', 'South', 'East', 'West'],\n",
        "        'tax_rate': [0.08, 0.06, 0.09, 0.07],\n",
        "        'shipping_cost': [5.99, 4.99, 6.99, 5.49]\n",
        "    }\n",
        "\n",
        "    return {\n",
        "        'pandas': {\n",
        "            'categories': pd.DataFrame(categories_data),\n",
        "            'regions': pd.DataFrame(regions_data)\n",
        "        },\n",
        "        'modin': {\n",
        "            'categories': mpd.DataFrame(categories_data),\n",
        "            'regions': mpd.DataFrame(regions_data)\n",
        "        }\n",
        "    }\n",
        "\n",
        "lookup_data = create_lookup_data()"
      ],
      "metadata": {
        "id": "kr7NPA4-9sJ7"
      },
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def advanced_joins(df, lookup):\n",
        "    result = df.merge(lookup['categories'], on='category', how='left')\n",
        "    result = result.merge(lookup['regions'], on='region', how='left')\n",
        "\n",
        "    result['commission_amount'] = result['transaction_amount'] * result['commission_rate']\n",
        "    result['tax_amount'] = result['transaction_amount'] * result['tax_rate']\n",
        "    result['total_cost'] = result['transaction_amount'] + result['tax_amount'] + result['shipping_cost']\n",
        "\n",
        "    return result\n",
        "\n",
        "join_results = benchmark_operation(\n",
        "    lambda df: advanced_joins(df, lookup_data['pandas']),\n",
        "    lambda df: advanced_joins(df, lookup_data['modin']),\n",
        "    dataset,\n",
        "    \"Advanced Joins & Calculations\"\n",
        ")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ZDtri8In9u3H",
        "outputId": "73bc400a-5bf7-479f-c8cd-3696d56feaf1"
      },
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "Advanced Joins & Calculations:\n",
            "  Pandas: 0.424s\n",
            "  Modin:  0.281s\n",
            "  Speedup: 1.51x\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "print(\"\\n\" + \"=\"*60)\n",
        "print(\"MEMORY EFFICIENCY COMPARISON\")\n",
        "print(\"=\"*60)\n",
        "\n",
        "def get_memory_usage(df, name):\n",
        "    \"\"\"Get memory usage of dataframe\"\"\"\n",
        "    if hasattr(df, '_to_pandas'):\n",
        "        memory_mb = df.memory_usage(deep=True).sum() / 1024**2\n",
        "    else:\n",
        "        memory_mb = df.memory_usage(deep=True).sum() / 1024**2\n",
        "\n",
        "    print(f\"{name} memory usage: {memory_mb:.1f} MB\")\n",
        "    return memory_mb\n",
        "\n",
        "pandas_memory = get_memory_usage(dataset['pandas'], \"Pandas\")\n",
        "modin_memory = get_memory_usage(dataset['modin'], \"Modin\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "aWTg483Z9wyL",
        "outputId": "cd9a25f1-09eb-4adc-972c-af3cbebfcfc2"
      },
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "============================================================\n",
            "MEMORY EFFICIENCY COMPARISON\n",
            "============================================================\n",
            "Pandas memory usage: 112.5 MB\n",
            "Modin memory usage: 112.5 MB\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "1Qjot8f37X-H",
        "outputId": "88e97027-3aff-48f2-ee1f-8889bf143d1c"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "============================================================\n",
            "PERFORMANCE SUMMARY\n",
            "============================================================\n",
            "\n",
            "Average Speedup: 0.45x\n",
            "Best Operation: Advanced Joins & Calculations (1.51x)\n",
            "\n",
            "Detailed Results:\n",
            "  Complex GroupBy Aggregation: 0.05x speedup\n",
            "  Advanced Data Cleaning: 0.02x speedup\n",
            "  Time Series Analysis: 0.23x speedup\n",
            "  Advanced Joins & Calculations: 1.51x speedup\n",
            "\n",
            "============================================================\n",
            "MODIN BEST PRACTICES\n",
            "============================================================\n",
            "1. Use 'import modin.pandas as pd' to replace pandas completely\n",
            "2. Modin works best with operations on large datasets (>100MB)\n",
            "3. Ray backend is most stable; Dask for distributed clusters\n",
            "4. Some pandas functions may fall back to pandas automatically\n",
            "5. Use .to_pandas() to convert Modin DataFrame to pandas when needed\n",
            "6. Profile your specific workload - speedup varies by operation type\n",
            "7. Modin excels at: groupby, join, apply, and large data I/O operations\n",
            "\n",
            "✅ Tutorial completed successfully!\n",
            "🚀 Modin is now ready to scale your pandas workflows!\n"
          ]
        }
      ],
      "source": [
        "print(\"\\n\" + \"=\"*60)\n",
        "print(\"PERFORMANCE SUMMARY\")\n",
        "print(\"=\"*60)\n",
        "\n",
        "results = [groupby_results, cleaning_results, ts_results, join_results]\n",
        "avg_speedup = sum(r['speedup'] for r in results) / len(results)\n",
        "\n",
        "print(f\"\\nAverage Speedup: {avg_speedup:.2f}x\")\n",
        "print(f\"Best Operation: {max(results, key=lambda x: x['speedup'])['operation']} \"\n",
        "      f\"({max(results, key=lambda x: x['speedup'])['speedup']:.2f}x)\")\n",
        "\n",
        "print(\"\\nDetailed Results:\")\n",
        "for result in results:\n",
        "    print(f\"  {result['operation']}: {result['speedup']:.2f}x speedup\")\n",
        "\n",
        "print(\"\\n\" + \"=\"*60)\n",
        "print(\"MODIN BEST PRACTICES\")\n",
        "print(\"=\"*60)\n",
        "\n",
        "best_practices = [\n",
        "    \"1. Use 'import modin.pandas as pd' to replace pandas completely\",\n",
        "    \"2. Modin works best with operations on large datasets (>100MB)\",\n",
        "    \"3. Ray backend is most stable; Dask for distributed clusters\",\n",
        "    \"4. Some pandas functions may fall back to pandas automatically\",\n",
        "    \"5. Use .to_pandas() to convert Modin DataFrame to pandas when needed\",\n",
        "    \"6. Profile your specific workload - speedup varies by operation type\",\n",
        "    \"7. Modin excels at: groupby, join, apply, and large data I/O operations\"\n",
        "]\n",
        "\n",
        "for tip in best_practices:\n",
        "    print(tip)\n",
        "\n",
        "ray.shutdown()\n",
        "print(\"\\n✅ Tutorial completed successfully!\")\n",
        "print(\"🚀 Modin is now ready to scale your pandas workflows!\")"
      ]
    }
  ]
}